# coding:utf-8

import scrapy
import bs4
from scrapy.contrib.spiders import CrawlSpider
from ..items import ChengmaiItem


class ChengmaiSpider(CrawlSpider):
    """Spider that collects housing-estate listings from www.cmaif.com.

    ``parse`` fans out to the numbered listing pages under ``/house/`` and
    ``parse_citypage_url`` turns every estate entry found on such a page
    into a ``ChengmaiItem``.

    NOTE(review): no crawl ``rules`` are defined and ``parse`` is overridden,
    so the rule machinery of ``CrawlSpider`` is not used here -- confirm
    whether a plain ``scrapy.Spider`` base class would be sufficient.
    """

    name = 'chengmaispider'
    allowed_domains = ['cmaif.com']
    start_urls = ['http://www.cmaif.com/house/']

    # Number of listing pages (1..max_pages) requested by parse().
    # Subclasses may override it; the default keeps the original 10 pages.
    max_pages = 10

    def parse(self, response):
        """Request every numbered listing page of the site.

        The response of the start URL itself is not inspected; it only
        triggers the pagination requests.

        :param response: response for the start URL.
        :return: generator of ``scrapy.Request`` objects, one per listing
            page, each carrying the site/city description in its ``meta``.
        """
        # Constant site/city description copied onto every item produced
        # from the listing pages.
        site_info = {
            'website': '澄迈网', 'web_url': 'www.cmaif.com/house/',
            'city': '澄迈', 'city_id': 'cmaif'
        }
        # int() so that a page count supplied as a string still works.
        for page in range(1, int(self.max_pages) + 1):
            page_url = 'http://www.cmaif.com/house/%d/' % page
            yield scrapy.Request(page_url,
                                 callback=self.parse_citypage_url,
                                 meta=site_info)

    def parse_citypage_url(self, response):
        """Extract the estates listed on one listing page.

        :param response: response for a listing page; its ``meta`` must
            contain the ``city``, ``city_id``, ``website`` and ``web_url``
            keys set by ``parse``.
        :return: generator of ``ChengmaiItem`` objects, one per estate entry
            that has a usable link.
        """
        meta = response.meta
        soup = bs4.BeautifulSoup(response.body, 'lxml')

        # Each estate is a <dl> directly below the ".houselistleft" container.
        for entry in soup.select('.houselistleft > dl'):
            # The first link of the entry carries the estate name and URL.
            link = entry.find('a')
            if link is None:
                # Entry without any link: nothing to extract, skip it
                # instead of aborting the whole page with an IndexError.
                continue
            href = link.get('href')
            if not href:
                # A link without href cannot be turned into a URL or an id.
                continue

            estate_url = 'http://www.cmaif.com' + href
            # The id is the href without its "/house/" prefix and ".html"
            # suffix.
            estate_id = href.replace('/house/', '').replace('.html', '')

            item = ChengmaiItem()
            item['city'] = meta['city']
            item['city_id'] = meta['city_id']
            item['website'] = meta['website']
            item['web_url'] = meta['web_url']
            item['area'] = ''
            item['estate'] = link.get_text().strip()
            item['estate_id'] = estate_id.strip()
            item['estate_url'] = estate_url.strip()
            yield item

